library(pollstR)
library(XML)
library(reshape)
library(ggplot2)
library(lme4)
library(WDI)
library(merTools)

# Standard data: load the replication polling file and derive per-poll weights.
setwd("~/Downloads/Science Replication and Data - Polling/")
dat <- read.csv(file = "global_polling_replication_data.csv", encoding = "latin1")

# Weight = reciprocal of (days to election + 2), rescaled by scale() without
# centering, so polls closer to the election get larger weights.
# Disabled alternative terms kept from the original line:
#   #+ scale(1/((abs(dat$pro.incumbent)+1) ), center=F) +scale(dat$n.elecs, center=F) #should do this by election
inv_days <- 1 / (dat$days.to.elec + 2)
dat$wts <- scale(inv_days, center = FALSE)

# The pro-incumbent score gets the same uncentered rescaling.
dat$pro.incumbent <- scale(dat$pro.incumbent, center = FALSE)

# Predicting the US election?
# Needed for smoothing: polls, pro-incumbent score, growth, pollster, country, region
# Needed for outcome: polity (10), inflation rate (0.8 pct)

######### POLLS ################## from RCP
# Reads the night-before 2016 US presidential poll table and derives, per poll:
# the field-period midpoint, days to the election, a recency weight, a signed
# spread, a normalised pollster key, pollster-level covariates looked up in
# `dat`, and the Clinton-minus-Trump poll margin.
setwd("/Volumes/TINY CRYPT/papers/Working Projects/Lazer Lab/LL Elections Project/Source Data/")
pls <- read.csv("2016_US_presidential_nightbefore.txt", sep = "\t", fileEncoding = "UTF-16")
#pls = pls[1:55, ]

# Getting the correct period of the polls - median day before election.
# `Date` is split on " - " into a first and second day; neither carries a year,
# so "/16" is appended before parsing. The split is done once and reused.
field_days <- strsplit(as.character(pls$Date), split = " - ")
pls$d1 <- as.Date(paste0(vapply(field_days, function(x) x[[1]], character(1)), "/16"), format = "%m/%d/%y")
pls$d2 <- as.Date(paste0(vapply(field_days, function(x) x[[2]], character(1)), "/16"), format = "%m/%d/%y")
pls$date <- pls$d1 + round((pls$d2 - pls$d1) / 2)
pls$days.to.elec <- as.Date("2016-11-07") - pls$date # days before Nov 7.
pls$wts <- scale(1 / (as.numeric(pls$days.to.elec) + 2), center = FALSE)

# Spread: "Tie" -> 0, otherwise the number after the leader's name. The leader
# must be read before the names are stripped, otherwise "Trump +2" and
# "Clinton +2" both collapse to 2. Sign follows pollmarg below: positive when
# Clinton leads, negative when Trump leads.
spread_chr <- gsub("Tie", "0", pls$Spread)
spread_sign <- ifelse(grepl("Trump", spread_chr), -1, 1)
pls$Spread <- spread_sign * as.numeric(gsub("Clinton|Trump|\\+| ", "", as.character(spread_chr)))

# Pollster key: lower-case, spaces removed.
pls$Pollster <- gsub(" ", "", tolower(as.character(pls$Poll)))

# disambiguate pollsters
# Take covariates from pollsters that appear in the 2008 or 2012 rows of `dat`.
# NOTE(review): dat$Pollster is compared un-normalised against normalised keys,
# so this only matches if the file already stores pollster names lower-case
# without spaces - confirm.
prior_rows <- dat$year == 2012 | dat$year == 2008
prior_keys <- gsub(" ", "", tolower(as.character(dat$Pollster[prior_rows])))
pollsterdat <- dat[which(dat$Pollster %in% prior_keys), c("pro.incumbent","incRun", "asp.gdp.bin", "electionid", "Pollster", "l1polity2", "wb.inflation" ) ]
# Keep the first row found for each pollster.
pollsterdat <- pollsterdat[!duplicated(pollsterdat$Pollster), ]
pls <- merge(pls, pollsterdat, by = "Pollster", all.x = TRUE)

# Pollsters with no match get a pro-incumbent score of 0.
pls$pro.incumbent[is.na(pls$pro.incumbent)] <- 0
pls$pollmarg <- pls$Clinton..D. - pls$Trump..R.
# Placeholder only: the 2016 outcome is what is being predicted.
pls$realmarg <- 0
pls$year <- 2016
#####

# ## STACKING WITH EXISTING DATA::
# Reduce the historical data to the modelling columns, then keep only the
# 2016 poll columns that share a name with it.
model_cols <- c( "realmarg", "year", "pollmarg", "pro.incumbent","incRun", "asp.gdp.bin", "electionid", "Pollster", "l1polity2", "wb.inflation", "wts", "ccode", "region" )
dat <- dat[, model_cols]
shared_cols <- names(pls) %in% names(dat)
pls <- pls[, shared_cols]

## Updating election data to current:::
# Election-level values for 2016 overwrite whatever the pollster merge supplied.
pls[["electionid"]] <- "002-2016-1107-P1"
pls[["asp.gdp.bin"]] <- "(0.0398,0.0621]"
pls[["incRun"]] <- 0
pls[["wb.inflation"]] <- .8
pls[["ccode"]] <- 2
# NOTE(review): takes the region of the first row of dat; presumably that row
# is a US election - confirm against the data file.
pls[["region"]] <- dat$region[1]

# Append the 2016 polls below the historical polls.
dat <- rbind(dat, pls)

# ## MODELING:::
# Weighted mixed model of the poll margin: a fixed pro-incumbent slope, a
# random intercept and pro-incumbent slope by incRun, and random intercepts for
# asp.gdp.bin, electionid, ccode, Pollster and region.
sm.mod <- lmer(
  pollmarg ~ pro.incumbent + (pro.incumbent | incRun) + (1 | asp.gdp.bin) +
    (1 | electionid) + (1 | ccode) + (1 | Pollster) + (1 | region),
  data = dat,
  weights = wts,
  na.action = na.exclude,
  control = lmerControl(
    optimizer = "bobyqa",
    boundary.tol = 1e-2,
    check.scaleX = "silent.rescale"
  )
)

# Smoothed poll margin: the model's prediction for the rows it was fitted on,
# stored back on dat.
dat$sm.pollmarg <- predict(sm.mod)
# Aggregate and predict
library(caret); set.seed(13243)

# One row per election: the median of the smoothed margin, raw poll margin,
# outcome and covariates, ordered by year for the time-slice resampling.
sm.agg <- aggregate(cbind(sm.pollmarg, pollmarg, realmarg, year, incRun, l1polity2, wb.inflation) ~ electionid, FUN = median, data = dat)
sm.agg <- sm.agg[order(sm.agg$year), ]

# The stacked 2016 polls carry a placeholder outcome (realmarg = 0) under
# electionid "002-2016-1107-P1". Leaving that row in would fit the final model
# on a made-up result for the very election being predicted, so it is excluded
# from the training frame. sm.agg itself keeps every election.
sm.train <- sm.agg[!(sm.agg$electionid %in% "002-2016-1107-P1"), ]

# Expanding-window resampling: start with 50 elections, predict one ahead.
trcontrol <- trainControl(method = "timeslice", initialWindow = 50, horizon = 1, fixedWindow = FALSE, savePredictions = TRUE) # , indexOut = indexOut
tr.stru <- train(realmarg ~ sm.pollmarg + l1polity2 + wb.inflation, data = sm.train, trControl = trcontrol, metric = "RMSE", method = "pls", tuneGrid = expand.grid(ncomp = 2))


## USA Polling DATA:: about a 4 point margin in the popular vote

# overall model with polls
set.seed(235232)
n_sims <- 1000
preds <- numeric(n_sims)
# Draw a simulated value, make prediction, save it.
for (i in seq_len(n_sims)) {
  # One simulated fit per 2016 poll from the mixed model, reduced to its median.
  sim_fit <- predictInterval(sm.mod, newdata = pls, n.sims = 1, returnSims = TRUE)$fit
  simresult <- median(sim_fit)
  # Feed the simulated margin into the outcome model with the fixed 2016
  # covariates; [, , 2] selects the third-dimension slice for 2 components.
  preds[i] <- predict(
    tr.stru$finalModel,
    newdata = data.frame(sm.pollmarg = simresult, l1polity2 = 10, wb.inflation = .8)
  )[, , 2]
}

# calculate final measure of probability
# Share of simulations with a positive predicted margin.
p_win <- mean(preds > 0)
p_win

# Build the axis label from the computed probability instead of a hard-coded
# figure, so the plot cannot disagree with the simulation above.
win_label <- sprintf(
  "Predictions for Hillary (%.1f%% probability of winning - %s simulations)",
  100 * p_win,
  format(n_sims, big.mark = ",")
)
qplot(preds) + theme_bw() + xlab(win_label) +
  ylab("")

##### DONE